/*
 * Copyright (C) 2009 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <assert.h>
#include <math.h>
#include <stdio.h>
#include <string.h>
#include <time.h>
#include "mystdlib.h"
#include "ngram.h"
#include <QDebug>

namespace ime_pinyin
{

#define ADD_COUNT 0.3

	int comp_double(const void *p1, const void *p2)
	{
		if (*static_cast<const double *>(p1) < *static_cast<const double *>(p2)) {
			return -1;
		}
		if (*static_cast<const double *>(p1) > *static_cast<const double *>(p2)) {
			return 1;
		}
		return 0;
	}

	inline double distance(double freq, double code)
	{
		// return fabs(freq - code);
		return freq * fabs(log(freq) - log(code));
	}

	// Find the index of the code value which is nearest to the given freq
	int qsearch_nearest(double code_book[], double freq, int start, int end)
	{
		if (start == end) {
			return start;
		}

		if (start + 1 == end) {
			if (distance(freq, code_book[end]) > distance(freq, code_book[start])) {
				return start;
			}
			return end;
		}

		int mid = (start + end) / 2;

		if (code_book[mid] > freq) {
			return qsearch_nearest(code_book, freq, start, mid);
		} else {
			return qsearch_nearest(code_book, freq, mid, end);
		}
	}

	size_t update_code_idx(double freqs[], size_t num, double code_book[],
	                       CODEBOOK_TYPE *code_idx)
	{
		size_t changed = 0;
		for (size_t pos = 0; pos < num; pos++) {
			CODEBOOK_TYPE idx;
			idx = qsearch_nearest(code_book, freqs[pos], 0, kCodeBookSize - 1);
			if (idx != code_idx[pos]) {
				changed++;
			}
			code_idx[pos] = idx;
		}
		return changed;
	}

	double recalculate_kernel(double freqs[], size_t num, double code_book[],
	                          CODEBOOK_TYPE *code_idx)
	{
		double ret = 0;

		size_t *item_num =  new size_t[kCodeBookSize];
		assert(item_num);
		memset(item_num, 0, sizeof(size_t) * kCodeBookSize);

		double *cb_new = new double[kCodeBookSize];
		assert(cb_new);
		memset(cb_new, 0, sizeof(double) * kCodeBookSize);

		for (size_t pos = 0; pos < num; pos++) {
			ret += distance(freqs[pos], code_book[code_idx[pos]]);

			cb_new[code_idx[pos]] += freqs[pos];
			item_num[code_idx[pos]] += 1;
		}

		for (size_t code = 0; code < kCodeBookSize; code++) {
			assert(item_num[code] > 0);
			code_book[code] = cb_new[code] / item_num[code];
		}

		delete [] item_num;
		delete [] cb_new;

		return ret;
	}

	void iterate_codes(double freqs[], size_t num, double code_book[],
	                   CODEBOOK_TYPE *code_idx)
	{
		size_t iter_num = 0;
		double delta_last = 0;
		do {
			size_t changed = update_code_idx(freqs, num, code_book, code_idx);

			double delta = recalculate_kernel(freqs, num, code_book, code_idx);

			if (kPrintDebug0) {
				printf("---Unigram codebook iteration: %zd : %zd, %.9f\n",
				       iter_num, changed, delta);
			}
			iter_num++;

			if (iter_num > 1 &&
			        (delta == 0 || fabs(delta_last - delta) / fabs(delta) < 0.000000001)) {
				break;
			}
			delta_last = delta;
		} while (true);
	}


	NGram *NGram::instance_ = NULL;

	NGram::NGram()
	{
		initialized_ = false;
		idx_num_ = 0;
		lma_freq_idx_ = NULL;
		sys_score_compensation_ = 0;

#ifdef ___BUILD_MODEL___
		freq_codes_df_ = NULL;
#endif
		freq_codes_ = NULL;
	}

	NGram::~NGram()
	{
		if (NULL != lma_freq_idx_) {
			free(lma_freq_idx_);
		}

#ifdef ___BUILD_MODEL___
		if (NULL != freq_codes_df_) {
			free(freq_codes_df_);
		}
#endif

		if (NULL != freq_codes_) {
			free(freq_codes_);
		}
	}

	NGram &NGram::get_instance()
	{
		if (NULL == instance_) {
			instance_ = new NGram();
		}
		return *instance_;
	}

	bool NGram::save_ngram(FILE *fp)
	{
		if (!initialized_ || NULL == fp) {
			return false;
		}

		if (0 == idx_num_ || NULL == freq_codes_ ||  NULL == lma_freq_idx_) {
			return false;
		}

		if (fwrite(&idx_num_, sizeof(size_t), 1, fp) != 1) {
			return false;
		}

		if (fwrite(freq_codes_, sizeof(LmaScoreType), kCodeBookSize, fp) !=
		        kCodeBookSize) {
			return false;
		}

		if (fwrite(lma_freq_idx_, sizeof(CODEBOOK_TYPE), idx_num_, fp) != idx_num_) {
			return false;
		}

		return true;
	}

//	bool NGram::load_ngram(FILE *fp)
    bool NGram::load_ngram(QFile &fp)
	{
        qDebug()<<"load_ngram";
//		if (NULL == fp) {
//			return false;
//		}
        if (!fp.isOpen()){
            return false;
        }

		initialized_ = false;

//		if (fread(&idx_num_, sizeof(size_t), 1, fp) != 1 ) {
//			return false;
//		}
        qint64 size = sizeof(size_t);
        if (fp.read((char*)&idx_num_, size) != size){
            return  false;
        }

		if (NULL != lma_freq_idx_) {
			free(lma_freq_idx_);
		}

		if (NULL != freq_codes_) {
			free(freq_codes_);
		}

		lma_freq_idx_ = static_cast<CODEBOOK_TYPE *>
		                (malloc(idx_num_ * sizeof(CODEBOOK_TYPE)));
		freq_codes_ = static_cast<LmaScoreType *>
		              (malloc(kCodeBookSize * sizeof(LmaScoreType)));

		if (NULL == lma_freq_idx_ || NULL == freq_codes_) {
			return false;
		}

//		if (fread(freq_codes_, sizeof(LmaScoreType), kCodeBookSize, fp) !=
//		        kCodeBookSize) {
//			return false;
//		}

//		if (fread(lma_freq_idx_, sizeof(CODEBOOK_TYPE), idx_num_, fp) != idx_num_) {
//			return false;
//		}
        size = sizeof(LmaScoreType) * kCodeBookSize;
        if (fp.read((char*)freq_codes_, size) != size){
            return false;
        }
        size = sizeof(CODEBOOK_TYPE) * idx_num_;
        if (fp.read((char*)lma_freq_idx_, size) != size){
            return false;
        }

		initialized_ = true;

		total_freq_none_sys_ = 0;
          qDebug()<<"load_ngram ok";
		return true;
	}

	void NGram::set_total_freq_none_sys(size_t freq_none_sys)
	{
		total_freq_none_sys_ = freq_none_sys;
		if (0 == total_freq_none_sys_) {
			sys_score_compensation_ = 0;
		} else {
			double factor = static_cast<double>(kSysDictTotalFreq) / (
			                    kSysDictTotalFreq + total_freq_none_sys_);
			sys_score_compensation_ = static_cast<float>(
			                              log(factor) * kLogValueAmplifier);
		}
	}

	// The caller makes sure this oject is initialized.
	float NGram::get_uni_psb(LemmaIdType lma_id)
	{
		return  static_cast<float>(freq_codes_[lma_freq_idx_[lma_id]]) +
		        sys_score_compensation_;
	}

	float NGram::convert_psb_to_score(double psb)
	{
		float score = static_cast<float>(
		                  log(psb) * static_cast<double>(kLogValueAmplifier));
		if (score > static_cast<float>(kMaxScore)) {
			score = static_cast<float>(kMaxScore);
		}
		return score;
	}

#ifdef ___BUILD_MODEL___
	bool NGram::build_unigram(LemmaEntry *lemma_arr, size_t lemma_num,
	                          LemmaIdType next_idx_unused)
	{
		if (NULL == lemma_arr || 0 == lemma_num || next_idx_unused <= 1) {
			return false;
		}

		double total_freq = 0;
		double *freqs = new double[next_idx_unused];
		if (NULL == freqs) {
			return false;
		}

		freqs[0] = ADD_COUNT;
		total_freq += freqs[0];
		LemmaIdType idx_now = 0;
		for (size_t pos = 0; pos < lemma_num; pos++) {
			if (lemma_arr[pos].idx_by_hz == idx_now) {
				continue;
			}
			idx_now++;

			assert(lemma_arr[pos].idx_by_hz == idx_now);

			freqs[idx_now] = lemma_arr[pos].freq;
			if (freqs[idx_now] <= 0) {
				freqs[idx_now] = 0.3;
			}

			total_freq += freqs[idx_now];
		}

		double max_freq = 0;
		idx_num_ = idx_now + 1;
		assert(idx_now + 1 == next_idx_unused);

		for (size_t pos = 0; pos < idx_num_; pos++) {
			freqs[pos] = freqs[pos] / total_freq;
			assert(freqs[pos] > 0);
			if (freqs[pos] > max_freq) {
				max_freq = freqs[pos];
			}
		}

		// calculate the code book
		if (NULL == freq_codes_df_) {
			freq_codes_df_ = new double[kCodeBookSize];
		}
		assert(freq_codes_df_);
		memset(freq_codes_df_, 0, sizeof(double) * kCodeBookSize);

		if (NULL == freq_codes_) {
			freq_codes_ = new LmaScoreType[kCodeBookSize];
		}
		assert(freq_codes_);
		memset(freq_codes_, 0, sizeof(LmaScoreType) * kCodeBookSize);

		size_t freq_pos = 0;
		for (size_t code_pos = 0; code_pos < kCodeBookSize; code_pos++) {
			bool found = true;

			while (found) {
				found = false;
				double cand = freqs[freq_pos];
				for (size_t i = 0; i < code_pos; i++)
					if (freq_codes_df_[i] == cand) {
						found = true;
						break;
					}
				if (found) {
					freq_pos++;
				}
			}

			freq_codes_df_[code_pos] = freqs[freq_pos];
			freq_pos++;
		}

		myqsort(freq_codes_df_, kCodeBookSize, sizeof(double), comp_double);

		if (NULL == lma_freq_idx_) {
			lma_freq_idx_ = new CODEBOOK_TYPE[idx_num_];
		}
		assert(lma_freq_idx_);

		iterate_codes(freqs, idx_num_, freq_codes_df_, lma_freq_idx_);

		delete [] freqs;

		if (kPrintDebug0) {
			printf("\n------Language Model Unigram Codebook------\n");
		}

		for (size_t code_pos = 0; code_pos < kCodeBookSize; code_pos++) {
			double log_score = log(freq_codes_df_[code_pos]);
			float final_score = convert_psb_to_score(freq_codes_df_[code_pos]);
			if (kPrintDebug0) {
				printf("code:%zd, probability:%.9f, log score:%.3f, final score: %.3f\n",
				       code_pos, freq_codes_df_[code_pos], log_score, final_score);
			}
			freq_codes_[code_pos] = static_cast<LmaScoreType>(final_score);
		}

		initialized_ = true;
		return true;
	}
#endif

}  // namespace ime_pinyin
